###########################################################################################
##### This file is used to extract data from two existing data sets, UCDP-GED and GTD #####
###########################################################################################
rm(list = ls())
library(plyr)
library(meltt)
library(foreign)
library(readxl)
library(readstata13)

# ---------- load the violence data ----------
# UCDP-GED: the global file plus a separate Syria file that is stacked onto it.
GED <- read.csv("../datasets/0_GED191.csv", stringsAsFactors = FALSE)
GEDsyria <- read.csv("../datasets/0_GED_syria.csv", stringsAsFactors = FALSE)
# Syria = 652
# Add the `gwnoa` column directly under its final name, right before `side_a`,
# instead of adding an unnamed column and renaming the auto-generated "652".
# NOTE(review): presumably the Syria file ships without `gwnoa` -- confirm.
library("tibble")
GEDsyria <- add_column(GEDsyria, gwnoa = 652, .before = "side_a")
# Fail early, with a readable message, if the two files cannot be stacked.
# rbind() on data frames matches columns by name, so only the set of names and
# the column count matter here (not the order).
same_columns <- ncol(GED) == ncol(GEDsyria) &&
  setequal(colnames(GED), colnames(GEDsyria))
if (!same_columns) {
  stop("GED and GEDsyria do not have the same columns; cannot rbind() them.",
       call. = FALSE)
}
GED <- rbind(GED, GEDsyria)
# Duplicate every event with side_b copied into side_a, so that the group
# filter below (which only looks at side_a) also catches groups recorded as
# side_b. After this step the data are at the event-actor level.
GED2 <- GED
GED2$side_a <- GED2$side_b
GED <- rbind(GED, GED2)
# Group names that must match side_a exactly.
GEDgroupfull <- c("al-Qaida",
                  "AQAP",
                  "AQIM",
                  "Al-Shabaab",
                  "ASL",
                  "Ansar al-Islam",
                  "Ansar al-Sunnah",
                  "Ahrar al-Sham",
                  "HTS",
                  "Taleban",
                  "IMU",
                  "IS",
                  "Jam'iyyat-i Islami-yi Afghanistan",
                  "Jaysh al-Islam",
                  "1920 Revolution Brigades",
                  "JRTN",
                  "Tanzim Hurras ad-Din",
                  "TTP",
                  "ETIM")
# Group name that may appear anywhere inside side_a (substring match).
GEDgrouppartial <- c("Jabhat Fateh al-Sham")
# Keep rows whose side_a is a listed group or contains the partial name.
# fixed = TRUE treats the name as a literal string rather than a regex.
keep_ged <- GED$side_a %in% GEDgroupfull |
  grepl(GEDgrouppartial, GED$side_a, fixed = TRUE)
GED <- GED[keep_ged, ]

# GTD: main distribution file plus the separate 1993 file, stacked together.
GTD93 <- read_excel("../datasets/0_GTD1993_0919dist.xlsx")
GTD <- rbind(read_excel("../datasets/0_globalterrorismdb_0919dist.xlsx"),
             GTD93)
# Append a copy of every row in which gname is replaced by gname2, so that the
# filter on gname below also sees the second listed group.
GTD2 <- GTD
GTD2[["gname"]] <- GTD2[["gname2"]]
GTD <- rbind(GTD, GTD2)
# Group names that must match gname exactly.
GTDgroupfull <- c(
  "Al-Shabaab",
  "Ansar al-Sharia (Libya)",
  "Ansar al-Islam",
  "Ansar al-Sunna",
  "Ansar Ghazwat-ul-Hind",
  "Chechen Rebels",
  "Haqqani Network",
  "Ahrar al-Sham",
  "Hay'at Tahrir al-Sham",
  "Islamic Army in Iraq (al-Jaish al-Islami fi al-Iraq)",
  "Taliban",
  "Islamic Movement of Uzbekistan (IMU)",
  "Sinai Province of the Islamic State",
  "Al-Nusrah Front",
  "Jamiat-e Islami-yi Afghanistan",
  "Jaysh al-Islam (Syria)",
  "1920 Revolution Brigades",
  "Al-Naqshabandiya Army",
  "Sipah-e-Sahaba/Pakistan (SSP)",
  "Tehrik-e-Taliban Islami (TTI)",
  "Eastern Turkistan Islamic Movement (ETIM)",
  "Turkestan Islamic Party"
)
# Name fragments that may appear anywhere inside gname.
GTDgrouppartial <- c("Al-Qaida", "Islamic State")
# A row is kept when gname is a listed group or contains either fragment; the
# two fragments are combined into one alternation pattern.
gtd_exact_hit <- GTD$gname %in% GTDgroupfull
gtd_partial_hit <- grepl(paste(GTDgrouppartial, collapse = "|"), GTD$gname)
GTD <- GTD[gtd_exact_hit | gtd_partial_hit, ]

# ---------- Use MELTT to merge the two data sets ----------
## Location: latitude; longitude

## Time: build a Date column named `date` in both data sets
GED$date <- as.Date(GED$date_start, format = "%Y-%m-%d")

# Prefix a month/day value with "0" unless its text form is already two
# characters wide.
zero_pad <- function(value) {
  txt <- as.character(value)
  ifelse(nchar(txt) == 2, txt, paste0("0", txt))
}
# GTD stores year, month and day in separate columns; glue them into a
# YYYYMMDD string and parse that. Strings that are not valid dates become NA.
gtd_ymd <- paste0(as.character(GTD$iyear),
                  zero_pad(GTD$imonth),
                  zero_pad(GTD$iday))
GTD$date <- as.Date(gtd_ymd, format = "%Y%m%d")

## Type (variables need taxonomies): event; group (actor); precision
## Build one taxonomy table per Type variable.
#  1) Event: targtype1_txt (GTD); type_of_violence (GED); event_type (ACLED)
#  note: targtype1_txt is used rather than attacktype from GTD because GED's
#  type_of_violence (state-based, non-state, one-sided) is defined by the
#  target.
GED$event_tax <- GED$type_of_violence
GTD$event_tax <- GTD$targtype1_txt

# GED codes map onto themselves.
ged_violence_codes <- c("1", "2", "3")
# GTD target type -> GED-style violence code; NA where no code is assigned.
gtd_target_to_level <- c(
  "Government (General)" = "1",
  "Government (Diplomatic)" = "1",
  "Private Citizens & Property" = "3",
  "Terrorists/Non-State Militia" = "2",
  "Police" = "1",
  "Military" = "1",
  "Business" = "3",
  "Utilities" = "1",
  "Maritime" = "1",
  "NGO" = "3",
  "Journalists & Media" = "3",
  "Transportation" = "1",
  "Telecommunication" = "1",
  "Religious Figures/Institutions" = "3",
  "Airports & Aircraft" = "1",
  "Educational Institution" = "3",
  "Unknown" = NA,
  "Tourists" = "3",
  "Food or Water Supply" = "3",
  "Violent Political Party" = "2",
  "Other" = NA
)

# One row per source category: GED rows first, then GTD rows.
event_tax <- data.frame(
  data.source = rep(c("GED", "GTD"),
                    times = c(length(ged_violence_codes),
                              length(gtd_target_to_level))),
  base.categories = c(ged_violence_codes, names(gtd_target_to_level)),
  event_level1 = c(ged_violence_codes, unname(gtd_target_to_level))
)

#  2) group (actor)
# ** Each event has two actors; datasets have been reformed at the event-actor level
# The group taxonomy maps every raw group name found in the filtered data
# (side_a for GED, gname for GTD) to one harmonised group label.
GED$group_tax <- GED$side_a
GTD$group_tax <- GTD$gname
# data.source and base.categories are derived from the data: one row per
# unique raw name, sorted alphabetically within each source (GED rows first).
# specif_level1 is typed by hand and is matched to those rows purely BY
# POSITION, so it must contain exactly one label per sorted unique name, in
# the same order. NOTE(review): if the input files or the group filters
# change, the sorted names shift and these labels silently attach to the
# wrong groups (or data.frame() fails on a length mismatch) -- re-check this
# vector against sort(unique(...)) whenever the inputs change.
group_tax <- data.frame(data.source = c(rep("GED", length(unique(GED$group_tax))), 
                                        rep("GTD", length(unique(GTD$group_tax)))),
                        base.categories = c(sort(unique(GED$group_tax)), 
                                            sort(unique(GTD$group_tax))),
                        specif_level1 = c(#GED: labels for sort(unique(GED$group_tax)), in order
                          "Kata'ib Thawrat al-'Ashirin",rep("Harakat Ahrar al-Sham al-Islamiyyah",6),"Jabhat Fateh al-Sham",
                          "al-Shabaab","Ansar al-Islam","Ansar al-Sunna","al Qaeda in the Arabian Peninsula",
                          "al Qaeda in the Islamic Maghreb","Ansar al Sharia in Libya","Turkistan Islamic Party",
                          rep("Jabhat Fateh al-Sham",2),"Hayyat Tahrir al-Sham","Islamic Movement of Uzbekistan",
                          "Islamic State",rep("Jabhat Fateh al-Sham",2),"Jamiat-e-Islami","Jaysh al-Islam","Naqsbandi Army",
                          "Islamic Emirate of Afghanistan","Tanzim Huras al-Din","Tehreek-e-Taliban Islami Pakistan",
                          #GTD: labels for sort(unique(GTD$group_tax)), in order
                          "Kata'ib Thawrat al-'Ashirin","Islamic State","Harakat Ahrar al-Sham al-Islamiyyah","Islamic State",
                          "Naqsbandi Army","Jabhat Fateh al-Sham","al Qaeda","al Qaeda in Iraq","al Qaeda",
                          "al Qaeda in Saudi Arabia","al Qaeda in the Arabian Peninsula",                                     
                          "al Qaeda in the Indian Subcontinent","al Qaeda in the Islamic Maghreb",rep("al Qaeda",4),
                          "al-Shabaab","Islamic State","Ansar al-Islam","Ansar al Sharia in Libya","Ansar al-Sunna",
                          "Ansar Ghazwat ul-Hind / Foundation of New Movement of Jihad in Kashmir",
                          rep("Islamic State",3),"Chechen Jihadists","Islamic State","Turkistan Islamic Party",
                          rep("Islamic State",2),"Haqqani network","Hayyat Tahrir al-Sham","Islamic State",
                          "Islamic Army in Iraq","Islamic Movement of Uzbekistan",rep("Islamic State",5),"al Qaeda",
                          "Jamiat-e-Islami","Jaysh al-Islam",rep("Islamic State",6),"Islamic State-affiliated group",
                          "Sipah-e-Sahaba Pakistan",rep("Islamic State",2),"al Qaeda","Islamic Emirate of Afghanistan",
                          "Tehreek-e-Taliban Islami Pakistan","Islamic State","Turkistan Islamic Party"))

#  3) Geographic precision: specificity (GTD); where_prec (GED)
GED$prec_tax <- GED$where_prec
GTD$prec_tax <- GTD$specificity

# Fine-grained precision label for each source code (GED codes 1-7, GTD codes
# 1-5 plus a missing code).
ged_prec_labels <- c("exact", "close", "smadmin", "lgadmin",
                     "imprec", "country", "notknown")
gtd_prec_labels <- c("exact", "smadmin", "smadmin", "lgadmin", "notknown", NA)

# Two levels: specif_level1 is the fine label, specif_level2 collapses it to
# precise / imprecise. GED rows come first, then GTD rows.
prec_tax <- data.frame(
  data.source = rep(c("GED", "GTD"), times = c(7, 6)),
  base.categories = c(as.character(1:7), as.character(1:5), NA),
  specif_level1 = c(ged_prec_labels, gtd_prec_labels),
  specif_level2 = c(rep("precise", 2), rep("imprecise", 5),
                    "precise", rep("imprecise", 4), NA)
)

#  Collect the three taxonomies in a list whose element names are the names
#  of the taxonomy columns added to GED and GTD above.
taxonomies <- list(event_tax = event_tax,
                   group_tax = group_tax,
                   prec_tax = prec_tax)

## merge the two datasets
#  **get rid of the NAs in date (and location)
GED <- GED[!is.na(GED$date), ]
GTD <- GTD[!is.na(GTD$date), ]
# Both coordinates must be present. (The previous version tested latitude
# twice, so rows with a missing longitude slipped through to meltt().)
GTD <- GTD[!is.na(GTD$latitude) & !is.na(GTD$longitude), ]

## merge
# twindow / spatwindow set the temporal and spatial windows within which two
# entries may be treated as the same event (days and kilometres per the meltt
# documentation -- TODO confirm against the installed version).
merged <- meltt(GED, GTD, taxonomies = taxonomies,
                twindow = 5, spatwindow = 10)

## get the deduplicated data
dupli <- meltt_duplicates(merged)
# Diagnostics: number of distinct ids on each side of the duplicate table.
length(unique(dupli$GED_id))
length(unique(dupli$GTD_eventid))
# 1) GED: retain all
# 2) GTD: remove obs duplicated with GED, i.e. every GTD event id that sits
#    in a duplicate row which also carries a GED id.
GTDrm <- dupli$GTD_eventid[!is.na(dupli$GTD_eventid) & !is.na(dupli$GED_id)]
GTD <- GTD[!(GTD$eventid %in% GTDrm), ]

## retain the useful covariates
# 1) GED
colnames(GED)
ged_keep <- c("date", "latitude", "longitude", "country",
              "group_tax", "best", "deaths_civilians",
              "event_tax")
GED2 <- GED[, ged_keep]
head(GED2)
# Translate raw group names and event codes through the GED rows of the
# taxonomy tables.
ged_group_rows <- group_tax$data.source == "GED"
GED2$group <- mapvalues(
  GED2$group_tax,
  from = as.character(group_tax$base.categories[ged_group_rows]),
  to = as.character(group_tax$specif_level1[ged_group_rows])
)
ged_event_rows <- event_tax$data.source == "GED"
GED2$attack_type <- mapvalues(
  GED2$event_tax,
  from = event_tax$base.categories[ged_event_rows],
  to = event_tax$event_level1[ged_event_rows]
)
# Prepend a constant source marker, then switch to the shared output names.
GED2 <- cbind("GED", GED2)
names(GED2) <- c("source", "date", "latitude", "longitude", "country",
                 "group_org", "deaths", "deaths_civilians", "event_org",
                 "group", "attack_type")

# 2) GTD
colnames(GTD)
gtd_keep <- c("date", "latitude", "longitude", "country_txt",
              "group_tax", "nkill", "nkillter",
              "event_tax")
GTD2 <- GTD[, gtd_keep]
# Civilian deaths = total killed minus perpetrators killed.
GTD2$deaths_civilians <- with(GTD2, nkill - nkillter)
head(GTD2)
# Translate raw group names and target types through the GTD rows of the
# taxonomy tables.
gtd_group_rows <- group_tax$data.source == "GTD"
GTD2$group <- mapvalues(
  GTD2$group_tax,
  from = as.character(group_tax$base.categories[gtd_group_rows]),
  to = as.character(group_tax$specif_level1[gtd_group_rows])
)
gtd_event_rows <- event_tax$data.source == "GTD"
GTD2$attack_type <- mapvalues(
  GTD2$event_tax,
  from = event_tax$base.categories[gtd_event_rows],
  to = event_tax$event_level1[gtd_event_rows]
)
# Prepend a constant source marker and rename all twelve columns.
GTD2 <- cbind("GTD", GTD2)
names(GTD2) <- c("source", "date", "latitude", "longitude", "country",
                 "group_org", "deaths", "deaths_group", "event_org",
                 "deaths_civilians", "group", "attack_type")
# Drop deaths_group and put the columns in the same order as GED2.
gtd_final <- c("source", "date", "latitude", "longitude", "country",
               "group_org", "deaths", "deaths_civilians", "event_org",
               "group", "attack_type")
GTD2 <- GTD2[, gtd_final]

# Stack the two harmonised tables (GED rows first) and save them as CSV
# without row names.
merged_violence <- rbind(GED2, GTD2)

merged_path <- "../datasets/0_merged_violence.csv"
write.csv(merged_violence, file = merged_path, row.names = FALSE)
